Importing required Libraries¶

In [2]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import pandas as pd
from statistics import mean,median, mode,stdev
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import plotly.express as px
import numpy as np
from scipy.stats import gaussian_kde

Creating MongoDB Atlas Connection & Retrieving Data¶

In [3]:
uri='mongodb+srv://santhosh90612-moncloudgo:Sansuganyas22@cluster0.yitq93m.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0'
client = MongoClient(uri,server_api=ServerApi('1'))
db = client.get_database('sample_airbnb')
col=db.get_collection('listingsAndReviews')
all_documents = list(col.find())
In [4]:
df=pd.DataFrame(all_documents)

Data Exploration¶

Shape¶

In [5]:
df.shape
Out[5]:
(5555, 42)

Dataset information¶

In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5555 entries, 0 to 5554
Data columns (total 42 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   _id                    5555 non-null   object        
 1   listing_url            5555 non-null   object        
 2   name                   5555 non-null   object        
 3   summary                5555 non-null   object        
 4   space                  5555 non-null   object        
 5   description            5555 non-null   object        
 6   neighborhood_overview  5555 non-null   object        
 7   notes                  5555 non-null   object        
 8   transit                5555 non-null   object        
 9   access                 5555 non-null   object        
 10  interaction            5555 non-null   object        
 11  house_rules            5555 non-null   object        
 12  property_type          5555 non-null   object        
 13  room_type              5555 non-null   object        
 14  bed_type               5555 non-null   object        
 15  minimum_nights         5555 non-null   object        
 16  maximum_nights         5555 non-null   object        
 17  cancellation_policy    5555 non-null   object        
 18  last_scraped           5555 non-null   datetime64[ns]
 19  calendar_last_scraped  5555 non-null   datetime64[ns]
 20  first_review           4167 non-null   datetime64[ns]
 21  last_review            4167 non-null   datetime64[ns]
 22  accommodates           5555 non-null   int64         
 23  bedrooms               5550 non-null   float64       
 24  beds                   5542 non-null   float64       
 25  number_of_reviews      5555 non-null   int64         
 26  bathrooms              5545 non-null   object        
 27  amenities              5555 non-null   object        
 28  price                  5555 non-null   object        
 29  security_deposit       3471 non-null   object        
 30  cleaning_fee           4024 non-null   object        
 31  extra_people           5555 non-null   object        
 32  guests_included        5555 non-null   object        
 33  images                 5555 non-null   object        
 34  host                   5555 non-null   object        
 35  address                5555 non-null   object        
 36  availability           5555 non-null   object        
 37  review_scores          5555 non-null   object        
 38  reviews                5555 non-null   object        
 39  weekly_price           714 non-null    object        
 40  monthly_price          656 non-null    object        
 41  reviews_per_month      94 non-null     float64       
dtypes: datetime64[ns](4), float64(3), int64(2), object(33)
memory usage: 1.8+ MB

Null Values¶

In [7]:
df.isnull().sum()
Out[7]:
_id                         0
listing_url                 0
name                        0
summary                     0
space                       0
description                 0
neighborhood_overview       0
notes                       0
transit                     0
access                      0
interaction                 0
house_rules                 0
property_type               0
room_type                   0
bed_type                    0
minimum_nights              0
maximum_nights              0
cancellation_policy         0
last_scraped                0
calendar_last_scraped       0
first_review             1388
last_review              1388
accommodates                0
bedrooms                    5
beds                       13
number_of_reviews           0
bathrooms                  10
amenities                   0
price                       0
security_deposit         2084
cleaning_fee             1531
extra_people                0
guests_included             0
images                      0
host                        0
address                     0
availability                0
review_scores               0
reviews                     0
weekly_price             4841
monthly_price            4899
reviews_per_month        5461
dtype: int64
  1. First last Review column is not required for our exploration so i will drop it
  2. beds, bathrooms, bedrooms, Security deposit, cleaning are may provide some insights we will find a way to fill these after we have an understanding about the dataset
  3. Weekly Price, Monthly Price, and Reviews per month are almost null so i will drop these as well.

Data Type¶

In [8]:
df.dtypes
Out[8]:
_id                              object
listing_url                      object
name                             object
summary                          object
space                            object
description                      object
neighborhood_overview            object
notes                            object
transit                          object
access                           object
interaction                      object
house_rules                      object
property_type                    object
room_type                        object
bed_type                         object
minimum_nights                   object
maximum_nights                   object
cancellation_policy              object
last_scraped             datetime64[ns]
calendar_last_scraped    datetime64[ns]
first_review             datetime64[ns]
last_review              datetime64[ns]
accommodates                      int64
bedrooms                        float64
beds                            float64
number_of_reviews                 int64
bathrooms                        object
amenities                        object
price                            object
security_deposit                 object
cleaning_fee                     object
extra_people                     object
guests_included                  object
images                           object
host                             object
address                          object
availability                     object
review_scores                    object
reviews                          object
weekly_price                     object
monthly_price                    object
reviews_per_month               float64
dtype: object

Most of the data types are objects. We will need to change for further processing and efficient storage.

Understanding all variables¶

In [9]:
pd.set_option('display.max_columns', 100)
df.head()
Out[9]:
_id listing_url name summary space description neighborhood_overview notes transit access interaction house_rules property_type room_type bed_type minimum_nights maximum_nights cancellation_policy last_scraped calendar_last_scraped first_review last_review accommodates bedrooms beds number_of_reviews bathrooms amenities price security_deposit cleaning_fee extra_people guests_included images host address availability review_scores reviews weekly_price monthly_price reviews_per_month
0 10006546 https://www.airbnb.com/rooms/10006546 Ribeira Charming Duplex Fantastic duplex apartment with three bedrooms... Privileged views of the Douro River and Ribeir... Fantastic duplex apartment with three bedrooms... In the neighborhood of the river, you can find... Lose yourself in the narrow streets and stairc... Transport: • Metro station and S. Bento railwa... We are always available to help guests. The ho... Cot - 10 € / night Dog - € 7,5 / night Make the house your home... House Entire home/apt Real Bed 2 30 moderate 2019-02-16 05:00:00 2019-02-16 05:00:00 2016-01-03 05:00:00 2019-01-20 05:00:00 8 3.0 5.0 51 1.0 [TV, Cable TV, Wifi, Kitchen, Paid parking off... 80.00 200.00 35.00 15.00 6 {'thumbnail_url': '', 'medium_url': '', 'pictu... {'host_id': '51399391', 'host_url': 'https://w... {'street': 'Porto, Porto, Portugal', 'suburb':... {'availability_30': 28, 'availability_60': 47,... {'review_scores_accuracy': 9, 'review_scores_c... [{'_id': '58663741', 'date': 2016-01-03 05:00:... NaN NaN NaN
1 10009999 https://www.airbnb.com/rooms/10009999 Horto flat with small garden One bedroom + sofa-bed in quiet and bucolic ne... Lovely one bedroom + sofa-bed in the living ro... One bedroom + sofa-bed in quiet and bucolic ne... This charming ground floor flat is located in ... There´s a table in the living room now, that d... Easy access to transport (bus, taxi, car) and ... I´ll be happy to help you with any doubts, tip... I just hope the guests treat the space as they... Apartment Entire home/apt Real Bed 2 1125 flexible 2019-02-11 05:00:00 2019-02-11 05:00:00 NaT NaT 4 1.0 2.0 0 1.0 [Wifi, Wheelchair accessible, Kitchen, Free pa... 317.00 NaN 187.00 0.00 1 {'thumbnail_url': '', 'medium_url': '', 'pictu... {'host_id': '1282196', 'host_url': 'https://ww... {'street': 'Rio de Janeiro, Rio de Janeiro, Br... {'availability_30': 0, 'availability_60': 0, '... {} [] 1492.00 4849.00 NaN
2 1001265 https://www.airbnb.com/rooms/1001265 Ocean View Waikiki Marina w/prkg A short distance from Honolulu's billion dolla... Great studio located on Ala Moana across the s... A short distance from Honolulu's billion dolla... You can breath ocean as well as aloha. Honolulu does have a very good air conditioned... Pool, hot tub and tennis We try our best at creating, simple responsive... The general welfare and well being of all the ... Condominium Entire home/apt Real Bed 3 365 strict_14_with_grace_period 2019-03-06 05:00:00 2019-03-06 05:00:00 2013-05-24 04:00:00 2019-02-07 05:00:00 2 1.0 1.0 96 1.0 [TV, Cable TV, Wifi, Air conditioning, Pool, K... 115.00 NaN 100.00 0.00 1 {'thumbnail_url': '', 'medium_url': '', 'pictu... {'host_id': '5448114', 'host_url': 'https://ww... {'street': 'Honolulu, HI, United States', 'sub... {'availability_30': 16, 'availability_60': 46,... {'review_scores_accuracy': 9, 'review_scores_c... [{'_id': '4765259', 'date': 2013-05-24 04:00:0... 650.00 2150.00 NaN
3 10021707 https://www.airbnb.com/rooms/10021707 Private Room in Bushwick Here exists a very cozy room for rent in a sha... Here exists a very cozy room for rent in a sha... Apartment Private room Real Bed 14 1125 flexible 2019-03-06 05:00:00 2019-03-06 05:00:00 2016-01-31 05:00:00 2016-01-31 05:00:00 1 1.0 1.0 1 1.5 [Internet, Wifi, Air conditioning, Kitchen, Bu... 40.00 NaN NaN 0.00 1 {'thumbnail_url': '', 'medium_url': '', 'pictu... {'host_id': '11275734', 'host_url': 'https://w... {'street': 'Brooklyn, NY, United States', 'sub... {'availability_30': 0, 'availability_60': 0, '... {'review_scores_accuracy': 10, 'review_scores_... [{'_id': '61050713', 'date': 2016-01-31 05:00:... NaN NaN NaN
4 10030955 https://www.airbnb.com/rooms/10030955 Apt Linda Vista Lagoa - Rio Quarto com vista para a Lagoa Rodrigo de Freit... Quarto com vista para a Lagoa Rodrigo de Freit... Apartment Private room Real Bed 1 1125 flexible 2019-02-11 05:00:00 2019-02-11 05:00:00 NaT NaT 2 1.0 1.0 0 2.0 [TV, Cable TV, Internet, Wifi, Air conditionin... 701.00 1000.00 250.00 0.00 1 {'thumbnail_url': '', 'medium_url': '', 'pictu... {'host_id': '51496939', 'host_url': 'https://w... {'street': 'Rio de Janeiro, Rio de Janeiro, Br... {'availability_30': 28, 'availability_60': 58,... {} [] NaN NaN NaN
  1. Few of the columns are not needed to solve our problem so i will drop those.
  2. Few columns are in array columns, we will need extract required information from these as new variable/column. It can also introduce complexities in querying and analyzing the data if we use it as array columns.

Statistical Descrption of the Data¶

In [10]:
df.describe()
Out[10]:
last_scraped calendar_last_scraped first_review last_review accommodates bedrooms beds number_of_reviews reviews_per_month
count 5555 5555 4167 4167 5555.000000 5550.000000 5542.000000 5555.000000 94.000000
mean 2019-03-01 12:48:43.420342016 2019-03-01 12:48:43.420342016 2016-11-27 16:08:28.855291648 2018-08-15 20:18:09.416846848 3.505851 1.411712 2.071454 27.606481 1.712766
min 2019-02-11 05:00:00 2019-02-11 05:00:00 2009-10-27 04:00:00 2012-01-06 05:00:00 1.000000 0.000000 0.000000 0.000000 1.000000
25% 2019-02-18 05:00:00 2019-02-18 05:00:00 2015-12-08 17:00:00 2018-08-08 04:00:00 2.000000 1.000000 1.000000 1.000000 1.000000
50% 2019-03-07 05:00:00 2019-03-07 05:00:00 2017-04-02 04:00:00 2019-01-02 05:00:00 3.000000 1.000000 2.000000 5.000000 1.000000
75% 2019-03-08 05:00:00 2019-03-08 05:00:00 2018-04-24 04:00:00 2019-02-15 05:00:00 4.000000 2.000000 3.000000 32.000000 2.000000
max 2019-03-11 04:00:00 2019-03-11 04:00:00 2019-03-10 05:00:00 2019-03-11 04:00:00 16.000000 20.000000 25.000000 533.000000 10.000000
std NaN NaN NaN NaN 2.297019 1.041942 1.619660 49.798376 1.492795

Because of incorrect data types and array columns we can't see much information here. We will check once again after we processed all these datas.

Duplicate¶

In [11]:
df.loc[df.duplicated(subset=['_id'])]
Out[11]:
_id listing_url name summary space description neighborhood_overview notes transit access interaction house_rules property_type room_type bed_type minimum_nights maximum_nights cancellation_policy last_scraped calendar_last_scraped first_review last_review accommodates bedrooms beds number_of_reviews bathrooms amenities price security_deposit cleaning_fee extra_people guests_included images host address availability review_scores reviews weekly_price monthly_price reviews_per_month

This indicates we dont have duplicate listings in our dataset.

Data Wrangling/Data Cleaning¶

Subsetting Data¶

Removing unnecessary column that are not needed for our analysis and has more than 70% null values

In [12]:
df.columns
Out[12]:
Index(['_id', 'listing_url', 'name', 'summary', 'space', 'description',
       'neighborhood_overview', 'notes', 'transit', 'access', 'interaction',
       'house_rules', 'property_type', 'room_type', 'bed_type',
       'minimum_nights', 'maximum_nights', 'cancellation_policy',
       'last_scraped', 'calendar_last_scraped', 'first_review', 'last_review',
       'accommodates', 'bedrooms', 'beds', 'number_of_reviews', 'bathrooms',
       'amenities', 'price', 'security_deposit', 'cleaning_fee',
       'extra_people', 'guests_included', 'images', 'host', 'address',
       'availability', 'review_scores', 'reviews', 'weekly_price',
       'monthly_price', 'reviews_per_month'],
      dtype='object')
In [13]:
df1=df[['_id', #'listing_url', 'name', 'summary', 'space', 'description',
       #'neighborhood_overview', 'notes', 'transit', 'access', 'interaction' 'house_rules','calendar_last_scraped',  'images',
        'property_type', 'room_type', 'bed_type',
       'minimum_nights', 'maximum_nights', 'cancellation_policy',
       'last_scraped',  'accommodates', 'bedrooms',
       'beds', 'number_of_reviews', 'bathrooms', 'amenities', 'price',
       'security_deposit', 'cleaning_fee', 'extra_people', 'guests_included',
      'host', 'address', 'availability', 'review_scores', 'reviews',
      # 'weekly_price', 'monthly_price', 'first_review', 'last_review',
       'reviews_per_month']].copy()

Preprocessing columns as per our requirement¶

Amenites¶

Changing Amenities from array columns to no of amenities which can be hlpful with comparing with other variables.

In [14]:
for i in df1['amenities'].values:
    amenities_count = 0
    amenities_count=(len(str(i).split(', ')))
df1['amenities_count'] = df1['amenities'].apply(lambda x: len(x))

Extracting address from array columns to individual columns¶

In [15]:
address_list = df1['address'].tolist()
df1['address'] = df1['address'].apply(lambda x: [x])
In [16]:
df1['administrative_division'] = df1['address'].apply(lambda x: x[0]['street'].split(', ')[0] if x is not None else None)
df1['region'] = df1['address'].apply(lambda x: x[0]['street'].split(', ')[1] if x is not None and 'street' in x[0] and len(x[0]['street'].split(', ')) >= 2 else None)
df1['country'] = df1['address'].apply(lambda x: x[0]['street'].split(', ')[-1] if x is not None and 'street' in x[0] and len(x[0]['street'].split(', ')) >= 2 else None)
df1['longitude'] = df1['address'].apply(lambda x: x[0]['location']['coordinates'][0] if x is not None else None)
df1['latitude'] = df1['address'].apply(lambda x: x[0]['location']['coordinates'][1] if x is not None else None)

Extracting Availablity & Reviws from array columns to individual column¶

In [17]:
df1['annual_availability'] = df1['availability'].apply(lambda y: y['availability_365'])
In [18]:
df1['review_scores_accuracy'] = df1['review_scores'].apply(lambda x: x.get('review_scores_accuracy', None))
df1['review_scores_cleanliness'] = df1['review_scores'].apply(lambda x: x.get('review_scores_cleanliness', None))
df1['review_scores_checkin'] = df1['review_scores'].apply(lambda x: x.get('review_scores_checkin', None))
df1['review_scores_communication'] = df1['review_scores'].apply(lambda x: x.get('review_scores_communication', None))
df1['review_scores_location'] = df1['review_scores'].apply(lambda x: x.get('review_scores_location', None))
df1['review_scores_value'] = df1['review_scores'].apply(lambda x: x.get('review_scores_value', None))
df1['review_scores_rating'] = df1['review_scores'].apply(lambda x: x.get('review_scores_rating', None))

Extracting Host details from array columns¶

In [19]:
df1['host_id']=df1['host'].apply(lambda x: x.get('host_id',None) )
df1['host_response_rate']=df1['host'].apply(lambda x: float(x.get('host_response_rate',0) ))
df1['host_total_listings_count']=df1['host'].apply(lambda x: int(x.get('host_total_listings_count',0) ))
df1['superhost']=df1['host'].apply(lambda x: x.get('host_is_superhost',False) )
df1['host_neighbourhood']=df1['host'].apply(lambda x: x.get('host_neighbourhood',False) )

Data Transfermation¶

Replacing string inconsistency in property type¶

In [20]:
propert_replace={'Guesthouse':'Guest suite',
                 'Bed and breakfast':'Hotel',
                 'Boutique hotel':'Hotel',
                 'Aparthotel':'Hotel',
                 'Nature lodge':'Condominium',
                 'Heritage hotel (India)':'Hotel',
                 'Bungalow':'House',
                 'Villa':'House',
                 'Townhouse':'House',
                 'Cottage':'Condominium',
                 'Resort':'Condominium',
                 'Casa particular (Cuba)':'Other',
                 'Farm stay':'Other',
                 'Tiny house':'Other',
                 'Camper/RV':'Other',
                 'Chalet':'Other',
                 'Earth house':'Other',
                 'Barn':'Other',
                 'Hut':'Other',
                 'Treehouse':'Other',
                 'Pension (South Korea)':'Other',
                 'Campsite':'Other',
                 'Houseboat':'Other',
                 'Castle':'Other',
                 'Train':'Other',
                 'Hostel':'Other',
                 'Train':'Other',
                 'Cabin':'Other',
                 'Boat':'Other',
                  
                }
df1['property_type'].replace(propert_replace , inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\2674706907.py:32: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['property_type'].replace(propert_replace , inplace=True)
In [21]:
df1['property_type'].value_counts()
Out[21]:
property_type
Apartment             3626
House                  760
Condominium            432
Serviced apartment     185
Hotel                  172
Loft                   142
Guest suite            131
Other                  107
Name: count, dtype: int64

Replacing string inconsistency in region column¶

In [22]:
replacement_dict = {
    'NY': 'New York',
    '*':'',
    'Québec':'Quebec',
    'Qc':'Quebec',
    'Cataluña':'Catalonia',
    'Catalonia':'Catalonia',
    'BARCELONA':'Catalonia',
    'Beyoğlu':'Istanbul',
    'Beyoğlu':'Istanbul',
    'Guangdong Province':'Guangdong Province',
    'Guangdong':'Guangdong Province',
    'QC':'Quebec',
    'CT':'Catalonia',
    'city':'Catalonia',
    'HI': 'Hawaii',
    
    'Hi': 'Hawaii',
    'İstanbul': 'Istanbul',
    'Portugal':'Grande Porto',
    'Porto':'Grande Porto',
    'istanbul':'Istanbul',
    'Istanbul Province,':'Istanbul',
    'Istanbul Province':'Istanbul',
    'taksim istanbul beyoglu taksim':'Istanbul',
    'Qecbec': 'Quebec',
    '香港岛': 'Hong Kong Island',
    '香港島':'Hong Kong Island',
    'Hongkong':'Hong Kong',
    '-':'Hong Kong',
    '佐敦': 'Jordan',
    '新界': 'New Territories',
    'NEW TERRITORIES':'New Territories',
    'N.T':'New Territories',
    '香港': 'Hong Kong',
    '广东省': 'Guangdong Province',
    '九龍': 'Hong Kong',
    '旺角': 'Hong Kong',
        'kowloon': 'Hong Kong',
    'Mongkok': 'Hong Kong',
    'État de Rio de Janeiro':'Rio De Janeiro',
    'State of Rio de Janeiro':'Rio De Janeiro',
    'Catalunya':'Catalonia',
'Porto District':'Grande Porto',
'City':'Catalonia',
'BARCELONA':'Catalonia',
'Barcelona':'Catalonia',
'Kowloon':'Hong Kong',
'Spain':'Catalonia',
'Aveiro':'Aveiro District',
'Turkey':'Istanbul',
'Guangdong Sheng':'Guangdong Province',
'Mong Kok':'Hong Kong',
'Beyoglu':'Istanbul',
'Taksim istanbul beyoglu taksim':'Istanbul',
'KAUAI':'Hawaii',
'Sariyer':'Istanbul',
'Vila Nova de Gaia':'Grande Porto',
'Canada':'Quebec',
'Sultanahmet':'Istanbul',
'Tsim Sha Tsui':'Hong Kong',
'US':'Hawaii',
'a tres cuadras de La Sagrada Flia.':'Catalonia',
'blue mosque area':'Istanbul',
'causeway bay':'Hong Kong',
'Santo Tirso':'Grande Porto',
'taksim \nistanbul beyoglu taksim':'Istanbul',
'İnönü Mahallesi':'Istanbul',
'Kağıthane':'Istanbul',
'Pendik':'Istanbul',
'North Point':'Hong Kong',
'Causeway Bay':'Hong Kong',
'Bahçelievler':'Istanbul',
'Bayoğlu / Taksim':'Istanbul',
'Başakşehir':'Istanbul',
'Beşiktaş/Gayrettepe':'Istanbul',
'Brazil':'Rio De Janeiro',
'Cihangir':'Istanbul',
'Cumhuriyet mahalesi':'Istanbul',
'Harbiye':'Istanbul',
'Hung Hom Bay':'Hong Kong',
'Kadikoy':'Istanbul',
'Kadıköy':'Istanbul',
'Kennedy Town':'Hong Kong Island',
'Nisantasi / Sisli':'Istanbul',
'Şişli':'Istanbul',
'N/A':'Hong Kong',

    'Rio de janeiro':'Rio De Janeiro',
    'Rio de Janeiro':'Rio De Janeiro',
    
    'RJ':'Rio De Janeiro',
    'Rj':'Rio De Janeiro',
    'Sarıyer':'Istanbul',
    'rio de janeiro':'Rio De Janeiro',
    '.':'',
    'İstanbul İli': 'Istanbul',
    'NSW':'New South Wales',
    'Sydney Australia': 'Sydney',
    'Australia':'Sydney',
    'TR':'Istanbul',
    'Küçük Ayasofya - Sultanahmet - Fatih / Istanbul / TURKIYE': 'Istanbul'
    }
df1['region'].replace(replacement_dict, inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\2166937456.py:104: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['region'].replace(replacement_dict, inplace=True)
In [23]:
df1.loc[(df1['region'] == '') & (df1['country'] == 'Hong Kong'), 'region'] = 'Hong Kong'
df1.loc[(df1['region'] == '') & (df1['country'] == 'Portugal'), 'region'] = 'Grande Porto'
df1['region'] = df1['region'].str.strip()

Replacing string inconsistency in Cancellation Policy column¶

In [24]:
Cancel_replace = {
    'strict_14_with_grace_period': 'Strict_14'}
df1['cancellation_policy'].replace(Cancel_replace, inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\454038236.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['cancellation_policy'].replace(Cancel_replace, inplace=True)
In [25]:
df1.cancellation_policy.value_counts()
Out[25]:
cancellation_policy
Strict_14          2420
flexible           1682
moderate           1336
super_strict_60      79
super_strict_30      38
Name: count, dtype: int64

Subsetting column After preprocessing¶

In [26]:
df1= df1[['_id', 'property_type', 'room_type', 'bed_type', 'minimum_nights',
       'maximum_nights', 'cancellation_policy', 'last_scraped', 'accommodates',
       'bedrooms', 'beds', 'number_of_reviews', 'bathrooms','price',
 #'amenities' ,'host', 'address', 'availability', 'review_scores','reviews_per_month','reviews',
     'security_deposit', 'cleaning_fee', 'extra_people',
       'guests_included',   'amenities_count',
       'administrative_division', 'region', 'country', 'longitude', 'latitude',
       'annual_availability', 'review_scores_accuracy',
       'review_scores_cleanliness', 'review_scores_checkin',
       'review_scores_communication', 'review_scores_location',
       'review_scores_value', 'review_scores_rating', 'host_id',
       'host_response_rate', 'host_total_listings_count', 'superhost',
       'host_neighbourhood']].copy()

Treating Missing Values¶

Converting Dtype and Checking data distribution¶

In [63]:
# sns.boxplot(data=miss_df,x='bedrooms')
# plt.show()
# sns.boxplot(data=miss_df,x='beds')
# plt.show()
# sns.boxplot(data=miss_df,x='bathrooms')
# plt.show()
# sns.boxplot(data=miss_df,x='security_deposit')
# plt.show()
# sns.boxplot(data=miss_df,x='cleaning_fee')
# plt.show()
# sns.boxplot(data=miss_df,x='price')
# plt.show()
In [64]:
miss_df=df1[['bedrooms', 'beds', 'bathrooms', 'security_deposit', 'cleaning_fee','price',]].copy()
miss_df['bathrooms'] = miss_df['bathrooms'].astype(str).astype(float)  
miss_df['security_deposit'] = miss_df['security_deposit'].astype(str).astype(float)  
miss_df['cleaning_fee'] = miss_df['cleaning_fee'].astype(str).astype(float)  
miss_df['price'] = miss_df['price'].astype(str).astype(float).astype(int)


for col in miss_df:
    
    plt.figure(figsize=(16,4))
    plt.subplot(121)
    sns.boxplot(data=miss_df,x=col)
    plt.subplot(122)
    sns.histplot(data=miss_df,x=col,kde=True)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

All Variables are right skwed and have outliers. So Median will be the right option to fill missing value

  1. Filling Bed rooms based on filtering the bed type using Median
  2. Filling beds based on the bed rooms offering.
  3. Filling bathrooms as listings offering no beds extremely rare.
  4. Filling Security deposits with 0 value
  5. Reviews i put -1 as there are no reviews for few columns. -1 represents no reviews
  6. Filling cleaning fee with as there will be a cleaning charge
  7. I drop rows with missing values in columns: 'reviews_per_month','review_scores_rating','review_scores_accuracy'

Treating missing value¶

In [72]:
bed_null=df1.query('bed_type=="Real Bed"').groupby('bed_type')['bedrooms'].median()
df1['bedrooms'].fillna(bed_null['Real Bed'], inplace=True)
df1['beds'].fillna(df['bedrooms'], inplace=True)
df1['beds'].fillna(0, inplace=True)
df1['bathrooms'].fillna(1, inplace=True)
df1['security_deposit'].fillna(0, inplace=True)
df1['review_scores_accuracy'].fillna(-1,inplace=True)  
df1['review_scores_cleanliness'].fillna(-1,inplace=True)  
df1['review_scores_checkin'].fillna(-1,inplace=True)  
df1['review_scores_communication'].fillna(-1,inplace=True)  
df1['review_scores_location'].fillna(-1,inplace=True)  
df1['review_scores_value'].fillna(-1,inplace=True)  
df1['review_scores_rating'].fillna(-1,inplace=True)  
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:1: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  bed_null=df1.query('bed_type=="Real Bed"').groupby('bed_type')['bedrooms'].median()
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:2: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['bedrooms'].fillna(bed_null['Real Bed'], inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['beds'].fillna(df['bedrooms'], inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:4: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['beds'].fillna(0, inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:5: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['bathrooms'].fillna(1, inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:6: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['security_deposit'].fillna(0, inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:7: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_accuracy'].fillna(-1,inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:8: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_cleanliness'].fillna(-1,inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:9: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_checkin'].fillna(-1,inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:10: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_communication'].fillna(-1,inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:11: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_location'].fillna(-1,inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:12: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_value'].fillna(-1,inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3138686020.py:13: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['review_scores_rating'].fillna(-1,inplace=True)
In [73]:
df1['cleaning_fee'] = df1['cleaning_fee'].astype('str').astype(float)
overall_mean = df1.loc[~df1['cleaning_fee'].isnull(), 'cleaning_fee'].median()
df1['cleaning_fee'].fillna(overall_mean, inplace=True)
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\3208852272.py:3: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['cleaning_fee'].fillna(overall_mean, inplace=True)
In [74]:
df1.isna().sum().sum()
Out[74]:
0

Treated all null values and the null count is zero.

After changing Data types we reduced data frame size to 310 % eventhough we added new variables and filling Missing values.

Chaging Data Type¶

In [75]:
df1['_id']=df1['_id'].astype('int32')
df1['property_type']=df1['property_type'].astype('category')
df1['room_type']=df1['room_type'].astype('category')
df1['host_neighbourhood']=df1['host_neighbourhood'].astype('category')
df1['bed_type']=df1['bed_type'].astype('category')
df1['minimum_nights']=df1['minimum_nights'].astype('int32')
df1['maximum_nights']=df1['maximum_nights'].astype('int32')
df1['cancellation_policy']=df1['cancellation_policy'].astype('category')
df1['accommodates']=df1['accommodates'].astype('int8')
df1['bedrooms']=df1['bedrooms'].astype('int8')
df1['beds']=df1['beds'].astype('int8')
df1['number_of_reviews']=df1['number_of_reviews'].astype('int16')
df1['bathrooms'] = df1['bathrooms'].astype(str).astype('float')
df1['price']=df1['price'].astype(str).astype(float).astype('float32')
df1['security_deposit']=df1['security_deposit'].astype(str).astype(float).astype('float32')
df1['cleaning_fee']=df1['cleaning_fee'].astype(str).astype(float).astype('float32')
df1['extra_people']=df1['extra_people'].astype('str').astype(float).astype('int8')
df1['guests_included']=df1['guests_included'].astype('str').astype(float).astype('int8')
df1['administrative_division']=df1['administrative_division'].astype('category')
df1['region']=df1['region'].astype('category')
df1['country']=df1['country'].astype('category')
df1['longitude']=df1['longitude'].astype('float')
df1['latitude']=df1['latitude'].astype('float')
df1['host_id']=df1['host_id'].astype('str')
df1['host_total_listings_count']=df1['host_total_listings_count'].astype('int32')
df1['amenities_count']=df1['amenities_count'].astype('int8')
df1['annual_availability']=df1['annual_availability'].astype('int16')
df1['host_id']=df1['host_id'].astype('str')
df1['review_scores_accuracy']=df1['review_scores_accuracy'].astype('float16')
df1['review_scores_cleanliness']=df1['review_scores_cleanliness'].astype('float16')
df1['review_scores_checkin']=df1['review_scores_checkin'].astype('float16')
df1['review_scores_communication']=df1['review_scores_communication'].astype('float16')
df1['review_scores_location']=df1['review_scores_location'].astype('float16')
df1['review_scores_value']=df1['review_scores_value'].astype('float16')
df1['review_scores_rating']=df1['review_scores_rating'].astype('float16')
df1['host_response_rate']=df1['host_response_rate'].astype('float16')
df1['last_scraped'] = pd.to_datetime(df1['last_scraped']).dt.strftime('%Y-%m-%d')
In [76]:
df1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5555 entries, 0 to 5554
Data columns (total 37 columns):
 #   Column                       Non-Null Count  Dtype   
---  ------                       --------------  -----   
 0   _id                          5555 non-null   int32   
 1   property_type                5555 non-null   category
 2   room_type                    5555 non-null   category
 3   bed_type                     5555 non-null   category
 4   minimum_nights               5555 non-null   int32   
 5   maximum_nights               5555 non-null   int32   
 6   cancellation_policy          5555 non-null   category
 7   last_scraped                 5555 non-null   object  
 8   accommodates                 5555 non-null   int8    
 9   bedrooms                     5555 non-null   int8    
 10  beds                         5555 non-null   int8    
 11  number_of_reviews            5555 non-null   int16   
 12  bathrooms                    5555 non-null   float64 
 13  price                        5555 non-null   float32 
 14  security_deposit             5555 non-null   float32 
 15  cleaning_fee                 5555 non-null   float32 
 16  extra_people                 5555 non-null   int8    
 17  guests_included              5555 non-null   int8    
 18  amenities_count              5555 non-null   int8    
 19  administrative_division      5555 non-null   category
 20  region                       5555 non-null   category
 21  country                      5555 non-null   category
 22  longitude                    5555 non-null   float64 
 23  latitude                     5555 non-null   float64 
 24  annual_availability          5555 non-null   int16   
 25  review_scores_accuracy       5555 non-null   float16 
 26  review_scores_cleanliness    5555 non-null   float16 
 27  review_scores_checkin        5555 non-null   float16 
 28  review_scores_communication  5555 non-null   float16 
 29  review_scores_location       5555 non-null   float16 
 30  review_scores_value          5555 non-null   float16 
 31  review_scores_rating         5555 non-null   float16 
 32  host_id                      5555 non-null   object  
 33  host_response_rate           5555 non-null   float16 
 34  host_total_listings_count    5555 non-null   int32   
 35  superhost                    5555 non-null   bool    
 36  host_neighbourhood           5555 non-null   category
dtypes: bool(1), category(8), float16(8), float32(3), float64(3), int16(2), int32(4), int8(6), object(2)
memory usage: 611.9+ KB

Statistical Descrption of the Data¶

In [77]:
df1.describe()
C:\Users\sansu\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\core\_methods.py:49: RuntimeWarning: overflow encountered in reduce
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
C:\Users\sansu\AppData\Local\Programs\Python\Python311\Lib\site-packages\numpy\core\_methods.py:49: RuntimeWarning: overflow encountered in reduce
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
Out[77]:
_id minimum_nights maximum_nights accommodates bedrooms beds number_of_reviews bathrooms price security_deposit cleaning_fee extra_people guests_included amenities_count longitude latitude annual_availability review_scores_accuracy review_scores_cleanliness review_scores_checkin review_scores_communication review_scores_location review_scores_value review_scores_rating host_response_rate host_total_listings_count
count 5.555000e+03 5555.000000 5.555000e+03 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.000000 5555.00 5555.000 5555.000000
mean 1.664386e+07 5.564356 1.382776e+06 3.505851 1.411341 2.068587 27.606481 1.290639 278.766144 318.313782 84.683525 11.593339 1.747435 21.854545 -6.623240 22.061230 173.105671 6.750000 6.578125 6.859375 6.855469 6.785156 6.566406 inf inf 14.405761
std 9.622558e+06 22.613861 5.256920e+07 2.297019 1.041547 1.619052 49.798376 0.701741 842.215515 1284.484741 95.020538 28.717417 1.487352 11.400905 90.525974 28.092112 139.841893 4.726562 4.648438 4.773438 4.769531 4.726562 4.621094 42.25 43.375 65.848868
min 1.170800e+04 1.000000 1.000000e+00 1.000000 0.000000 0.000000 0.000000 0.000000 9.000000 0.000000 0.000000 -126.000000 1.000000 1.000000 -159.678690 -34.088290 0.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.00 0.000 0.000000
25% 8.376039e+06 1.000000 5.900000e+01 2.000000 1.000000 1.000000 1.000000 1.000000 70.000000 0.000000 35.000000 0.000000 1.000000 13.000000 -73.599530 20.724390 17.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.00 0.000 1.000000
50% 1.711345e+07 2.000000 1.125000e+03 3.000000 1.000000 2.000000 5.000000 1.000000 129.000000 0.000000 60.000000 0.000000 1.000000 20.000000 -8.611420 40.727130 171.000000 10.000000 9.000000 10.000000 10.000000 10.000000 9.000000 92.00 100.000 2.000000
75% 2.461229e+07 3.000000 1.125000e+03 4.000000 2.000000 3.000000 32.000000 1.000000 280.000000 250.000000 100.000000 20.000000 2.000000 28.000000 29.019605 41.162755 317.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 98.00 100.000 6.000000
max 3.295874e+07 1250.000000 2.147484e+09 16.000000 20.000000 25.000000 533.000000 16.000000 48842.000000 39228.000000 2000.000000 127.000000 16.000000 76.000000 151.338980 45.665630 365.000000 10.000000 10.000000 10.000000 10.000000 10.000000 10.000000 100.00 100.000 1198.000000
  1. Maximun night columns have invalid or extrame outlier which needs to be treated.
  2. On an average 173 listings available in a year.
  3. Average Security Deposit is more than the average price.
  4. On an average 21 amenties are provided.
  5. On an average a host list 14 properties

Checking the number of unique values of each column¶

In [78]:
df1.nunique()
Out[78]:
_id                            5555
property_type                     8
room_type                         3
bed_type                          5
minimum_nights                   45
maximum_nights                  140
cancellation_policy               5
last_scraped                      7
accommodates                     16
bedrooms                         13
beds                             19
number_of_reviews               259
bathrooms                        17
price                           649
security_deposit                213
cleaning_fee                    291
extra_people                    122
guests_included                  14
amenities_count                  73
administrative_division         564
region                           15
country                           9
longitude                      5348
latitude                       5355
annual_availability             366
review_scores_accuracy           10
review_scores_cleanliness         9
review_scores_checkin            10
review_scores_communication      10
review_scores_location            9
review_scores_value              10
review_scores_rating             42
host_id                        5104
host_response_rate               62
host_total_listings_count       132
superhost                         2
host_neighbourhood              447
dtype: int64

Finding Outliers using plots¶

In [79]:
fea='price','security_deposit','cleaning_fee','accommodates','bedrooms','beds','minimum_nights','maximum_nights'
sns.set_style('dark')
for col in fea:
    # df1[col] = df1[col].astype(float)
    print(col)
    plt.figure(figsize=(16,4))
    plt.subplot(131)
    sns.histplot(df1[col],kde=True,bins=50,label="skew: " +str(np.round(df1[col].skew(),2)))
    plt.legend()
    plt.subplot(132)
    sns.boxplot(x=df1[col])
    plt.subplot(133)
    stats.probplot(df1[col],plot=plt)
    # plt.tight_layout()
    plt.show()
price
No description has been provided for this image
security_deposit
No description has been provided for this image
cleaning_fee
No description has been provided for this image
accommodates
No description has been provided for this image
bedrooms
No description has been provided for this image
beds
No description has been provided for this image
minimum_nights
No description has been provided for this image
maximum_nights
No description has been provided for this image

Capping Outliers with IQR method and with boundries 1.5.¶

Note- We no need to treat outliers using capping method since our problem statement is to vizualize the data rather creating predictive ML models. I am doing it just for an experience. In Future after i learn ML techniques, probably i create ML models with using this project.¶

  1. We see from the summary statistics, feature variables/columns are extrmely right skewed due to extrame high price of few listings. Let's not remove data rather we use IQR method to fill outliers to boundries value in price variable.
  2. We can see few errors in Minimum and Maximum nights. Minimum 1200 nights and maximum with 10 digit number which is not obvious. We cannot impute this using IQR method. I checked online and understood that hosts may offers 999 on an average for maximum nights. Some of the listings set their minumum stay for a month or two due to demand, pricing strategy, long term stays (student, temporary housing, etc) and other criteria.
  3. So my decision after understading the Minimum and Maximum stays over online and information from our data set are minimum nights can be up to 40 to 60 days. Maximun night is 999 days however our variable mode is 1125 so i decided to stick with 1125. I go over dataset rows which are grater than 1125 in Maximum nights and grater than 40 days in minimum nights.
In [80]:
fea='price','security_deposit','cleaning_fee'
df1=df1.copy()
def iqr_cap(df1,cols,fact):
    for col in cols:
        q1=df1[col].quantile(0.25)
        q3=df1[col].quantile(0.75)
        iqr=q3-q1
        up_fence=q3+(fact*iqr)
        low_fence=q1-(fact*iqr)
        print(f'Upper Boundry for {col} - ',up_fence)
        print(f'Lower Boundry for {col}- ',low_fence)
        df1[col] = np.where(df1[col]>=up_fence,up_fence,np.where(df1[col]<=low_fence,low_fence,df1[col]))
In [38]:
iqr_cap(df1,fea,1.5)
Upper Boundry for price -  601.25
Lower Boundry for price-  -248.75
Upper Boundry for security_deposit -  625.0
Lower Boundry for security_deposit-  -375.0
Upper Boundry for cleaning_fee -  197.5
Lower Boundry for cleaning_fee-  -62.5

Treating outliers/errors in the maximun and maximun night variable.¶

We can see few errors in Minimum and Maximum nights. Minimum 1200 nights and maximum with 10 digit number which is not obvious. We cannot impute this using IQR method. I checked online and understood that hosts may offers 999 on an average for maximum nights. Some of the listings set their minumum stay for a month or two due to demand, pricing strategy, long term stays (student, temporary housing, etc) and other criteria.

So my decision after understading the Minimum and Maximum stays over online and information from our data set are minimum nights can be up to 40 to 60 days. Maximun night is 999 days however our variable mode is 1125 so i decided to stick with 1125. I go over dataset rows which are grater than 1125 in Maximum nights and grater than 40 days in minimum nights.

In [82]:
filtered_df_max = df1[df1.maximum_nights > 2000]
count = filtered_df_max.shape[0]
df1 = df1.drop(df1[df1.maximum_nights > 2000].index)
In [83]:
filtered_df_min = df1[df1.minimum_nights > 60]
filtered_df_min.count()
count = filtered_df_min.shape[0]
df1 = df1.drop(df1[df1.minimum_nights > 60].index)
In [84]:
sns.set_style('dark')
fea='price','security_deposit','cleaning_fee','accommodates','bedrooms','beds','minimum_nights','maximum_nights'
for col in fea:
    plt.figure(figsize=(16,4))
    plt.subplot(131)
    sns.histplot(df1[col],kde=True,label="skew: " +str(np.round(df1[col].skew(),2)))
    plt.legend()
    plt.subplot(132)
    sns.boxplot(x=df1[col])
    plt.subplot(133)
    stats.probplot(df1[col],plot=plt)
    plt.tight_layout()
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Exploratory Data Analysis:¶

Correlation Analysis, Visualizations and Statistical Analysis

Room Type and countries¶

In [85]:
fig, axs = plt.subplots(1, 2, figsize=(16, 4), gridspec_kw={'width_ratios': [6, 10]})
axs[0].set_title('Listing Count by Room Type')
sns.countplot(x=df1['room_type'],hue=df1.room_type,stat="percent", ax=axs[0])
sns.countplot(x=df1['room_type'],hue=df1.country,stat="percent", ax=axs[1])
axs[1].set_title('Listings Count by room type and country')
plt.show()
No description has been provided for this image
  1. Most of listings are Entire Home/Apartment follwed by Provate Room and Shared Room
  2. United States holds 50% higher listings in Entire Home/Apartment Room Type comparing other countries
  3. All countries listings are more or less equal in Private Room
  4. China is less contributor when comparing all countries

Listing price in terms cancellation policy, accomadates with Room Type¶

In [86]:
fig, axs = plt.subplots(1, 2, figsize=(16,8), gridspec_kw={'width_ratios': [8, 10]})
axs[0].set_title('Listing Count by Room Type')
sns.scatterplot(data=df1,x='cancellation_policy',y='price',hue='room_type',ax=axs[0])

axs[0].tick_params(axis='x', rotation=10)
sns.scatterplot(data=df1, x="accommodates", y="price", hue="room_type", ax=axs[1])
axs[1].set_title('Listings Count by room type and country')
plt.show()
No description has been provided for this image
  1. From the left plot, we can see that the most of the listings are booked under Strict 14 cancellation policy type
  2. From th right side plot, we can see that one accomadate listings are mostly Private rooms
  3. From th right side plot, we can see that the most of the listings are booked with 2 accomadates.

Top 10 Hosts and Super Host interms of listing count¶

In [87]:
fig, axs = plt.subplots(1, 2, figsize=(16, 4), gridspec_kw={'width_ratios': [6, 10]})
axs[0].set_title('Superhost count in terms of total listings')
top_superhost = df1.groupby('superhost')['host_total_listings_count'].sum()
sns.barplot(x=top_superhost.values, y=top_superhost.index, orient='h',ax=axs[0]) 

top_host = df1.groupby('host_id')['host_total_listings_count'].sum().nlargest(10)
top_host_superhost = df1[df1['host_id'].isin(top_host.index)]['superhost']
colors = ['green' if sh else 'red' for sh in top_host_superhost]
sns.barplot(x=top_host.values, y=top_host.index, orient='h',hue=top_host.index,palette=colors,ax=axs[1]) 
plt.xlabel('Total Listings Count')
plt.ylabel('Host ID')
axs[1].set_title('Top 10 Hosts by Total Listings Count')
plt.show()
C:\Users\sansu\AppData\Local\Temp\ipykernel_23956\575269493.py:9: UserWarning: The palette list has more values (74) than needed (10), which may not be intended.
  sns.barplot(x=top_host.values, y=top_host.index, orient='h',hue=top_host.index,palette=colors,ax=axs[1])
No description has been provided for this image
  1. 80% of listings done by normal host
  2. Top hosts in terms of total listing is also normal hosts

Cancellation Policy and Listings¶

In [88]:
import seaborn as sns
import matplotlib.pyplot as plt

cancellation_order = df1['cancellation_policy'].value_counts().index

# Create subplots
fig, axs = plt.subplots(1, 2, figsize=(16, 4), gridspec_kw={'width_ratios': [6, 10]})

# First subplot
axs[0].set_title('Listing Count by Cancellation Policy')
sns.countplot(x=df1['cancellation_policy'], hue=df1.cancellation_policy, order=cancellation_order, ax=axs[0])
axs[0].tick_params(axis='x', rotation=10)  # Rotate x-axis labels

# Second subplot
axs[1].set_title('Listing count by Cancellation Policy and Room Type')
sns.countplot(x=df1['cancellation_policy'], hue=df1.room_type, order=cancellation_order, ax=axs[1])
axs[1].tick_params(axis='x', rotation=10)  # Rotate x-axis labels

plt.show()
No description has been provided for this image
  1. Most listings are from strict 14 cancellation policy
  2. Most of the Private rooms are booked under flexible cancellation policy
  3. 30 days cancellation is lesser than comparing all.

Top listings in terms of country¶

In [89]:
plt.figure(figsize=(15,6))
shape = df1.country.value_counts().values
labels = df1.country.value_counts().index
plt.pie(x=shape,labels=shape,explode=[.1] * len(shape),shadow=True, autopct = '%1.1f%%', startangle=90)
plt.legend(labels,loc="center left", bbox_to_anchor=(1, 0.5))
plt.show()
No description has been provided for this image
  1. US is listed 21% of properties and holds predominant place when compare to other countries.
  2. Followed by US, all other countries listed almost equally except China as it holds 0.3%.

Average price, Security Deposit, Cleasing Fee for each room type¶

In [90]:
df_price_room_type = pd.DataFrame(df1.groupby(['room_type'],observed=False).agg({'price': 'mean', 'cleaning_fee': 'mean','security_deposit':'mean'}).sort_values(by='price'))
plt.figure(figsize=(16,4))
plt.subplot(131)
plt.title('Average price by room_type')
sns.barplot(data=df_price_room_type,x='room_type',y='price',hue='room_type')
plt.legend('')
plt.subplot(132)
plt.title('Average cleaning_fee by room_type')
sns.barplot(data=df_price_room_type,x='room_type',y='cleaning_fee',hue='room_type')
plt.legend('')
plt.subplot(133)
plt.title('Average security_deposit by room_type')
sns.barplot(data=df_price_room_type,x='room_type',y='security_deposit',hue='room_type')
plt.legend('')
plt.show()
No description has been provided for this image
  1. Eventhough shared room has less numbers when compared to others it's average price, cleaning fee are high but security deposit is less.
  2. Average Security Deposit it almost equal to Average price. We may also consider that not all the properties requesting Security Deposit. The average is derrived from the listings that are requested Security Deposit.
  3. Entire home/apt holds predominant place which seems obvious. It is higher count.

Comparison of Price Distributions and Density of Price Ranges for Each Room Type¶

In [91]:
plt.figure(figsize=(14,6))
sns.kdeplot(df1[df1.room_type=='Entire home/apt'].price,color='maroon',label='Entire home/apt',linewidth=2.5)
sns.kdeplot(df1[df1.room_type=='Private room'].price,color='black',label='Private room',linewidth=2.5)
sns.kdeplot(df1[df1.room_type=='Shared room'].price,color='green',label='Shared room',linewidth=2.5)
plt.title('Room Type VS price distribution for price')
plt.xlim(0,800)
plt.show()
No description has been provided for this image

Comparison of Price Distributions and Density of Price Ranges for Each top 5 region¶

Price distribution in terms of room type and Cancellation poli¶

In [92]:
fig, axs = plt.subplots(1, 2, figsize=(16, 5))
plt.subplot(121)
sns.histplot(data=df1,x='price',bins=30,kde=True,hue='room_type')
plt.subplot(122)
sns.histplot(data=df1,x='price',kde=True,hue='cancellation_policy')
plt.show()
No description has been provided for this image
  1. Entire home/apt tend to high proce followed by Private Room. Shared room very less comparitively Entire home and Private room.
  2. Strict 14 and Flexible cancellation type properties are listed most frequestly.

Minimum & maximun Nights by room type¶

In [93]:
min_max_ni8 = pd.DataFrame(df1.groupby(['room_type'],observed=False).agg({'minimum_nights': 'mean','maximum_nights':'mean'}))
plt.figure(figsize=(11,4))
plt.subplot(121)
plt.title('Average Minimum Nights by room_type')
sns.barplot(data=min_max_ni8,x='room_type',y='minimum_nights',hue='room_type')
plt.legend('')
plt.subplot(122)
plt.title('Average Maximum Nights by room_type')
sns.barplot(data=min_max_ni8,x='room_type',y='maximum_nights',hue='room_type')
plt.legend('')
plt.show()
No description has been provided for this image
  1. Average Minumum Night for Entire home/apt is aroun 5 days and Maximun night is around 700 days
  2. Rememeber, as we discussed about these variables in Outlier Treatment, the minimum and Maximum stays may vary depends on several criteria and property type.

Reviews count interms of Cancellation policy and Room type¶

In [94]:
min_max_ni8 = pd.DataFrame(df1.groupby(['cancellation_policy','room_type'], observed=False)['number_of_reviews'].sum()).reset_index()
min_max_ni8 = min_max_ni8.sort_values(by='number_of_reviews', ascending=False)

plt.figure(figsize=(12, 5))

plt.subplot(121)
sns.barplot(data=min_max_ni8, x='cancellation_policy', y='number_of_reviews', order=min_max_ni8['cancellation_policy'],hue='room_type')
plt.title('Total Number of Reviews by Cancellation Policy')
plt.xlabel('Cancellation Policy')
plt.ylabel('Total Number of Reviews')
plt.show()
No description has been provided for this image
  1. Strict 14 holds predominant number of reviews
  2. Entire home/apt has more number of reviews in all aspects

Reviews count interms of Country and Room type¶

In [95]:
Region = pd.DataFrame(df1.groupby(['country','room_type'], observed=False)['number_of_reviews'].sum()).reset_index()
Region = Region.sort_values(by='number_of_reviews', ascending=False)

plt.figure(figsize=(12, 6))

sns.barplot(data=Region, y='country', x='number_of_reviews', order=Region['country'], orient='h',hue='room_type')
plt.title('Total Number of Reviews by Country and Room Type')
plt.xlabel('Total Number of Reviews')
plt.ylabel('Country')
plt.show()
No description has been provided for this image
  1. US holds predominant number of reviews
  2. Entire home/apt has more number of reviews in all aspects

Spatial Distribution of Listings in terms of top 10 Regions¶

In [ ]:
 
In [96]:
import folium


top_map = df1.groupby('region', observed=False).agg({'latitude': 'first', 'longitude': 'first', 'price': 'sum'}).reset_index()
top_5_map=top_map.sort_values(by='price')

maps = []
# Iterate through each region
for index, row in top_5_map.iterrows():
    region_listings = df1[df1['region'] == row['region']]
    maps.append(row['region'])
    # Create a map centered around the mean latitude and longitude of the region
    m = folium.Map(location=[row['latitude'], row['longitude']], zoom_start=9)

        
    # Filter the DataFrame to get listings in the current region
    
    
    # Add scatter markers for each listing in the region
    for _, listing in region_listings.iterrows():
        
        folium.Marker(location=[listing['latitude'], listing['longitude']],popup=row['region']).add_to(m)
    
    # Add the map to the list of maps
    maps.append(m)

# Display all the maps in the list
for m in maps:
    display(m)
'Sydney'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Jordan'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Aveiro District'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Guangdong Province'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Grande Porto'
Make this Notebook Trusted to load map: File -> Trust Notebook
'New Territories'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Catalonia'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Quebec'
Make this Notebook Trusted to load map: File -> Trust Notebook
'New York'
Make this Notebook Trusted to load map: File -> Trust Notebook
'New South Wales'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Hawaii'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Hong Kong Island'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Hong Kong'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Istanbul'
Make this Notebook Trusted to load map: File -> Trust Notebook
'Rio De Janeiro'
Make this Notebook Trusted to load map: File -> Trust Notebook

From above plot i could see that the 99% listings are nearby Beach, Lake, and River.

In [97]:
df_top_region = df1.groupby('region',observed=False).size().reset_index(name='size')
df_top_region=df_top_region.sort_values(by='size',ascending=False).nlargest(10,'size')

for index, row in df_top_region.iterrows():
    region = row['region']
    dff = df1[df1['region'] == region]  # Filter DataFrame for the current region
    lat = dff['latitude'] 
    lon = dff['longitude']
    
    xy = np.vstack([lat, lon])
    z = gaussian_kde(xy)(xy)
    idx = z.argsort()
    z = z[idx]
    
    plt.figure(figsize=(12, 6))
    plt.scatter(lon, lat, c=z, s=15,cmap='magma',marker="p")
    plt.xlabel("Longitude")
    plt.ylabel("Latitude")
    plt.title(f"Density of listings in {region} ({row.values[1]} Listings)")
    print()
    plt.show()

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

No description has been provided for this image

Listings Availability¶

In [98]:
df1['annual_availability'].describe()
Out[98]:
count    5506.000000
mean      172.772975
std       139.919947
min         0.000000
25%        16.000000
50%       170.000000
75%       317.000000
max       365.000000
Name: annual_availability, dtype: float64
In [99]:
df1['annual_availability'].value_counts().head(10)
Out[99]:
annual_availability
0      1188
365     406
364     121
90       67
179      66
363      54
89       52
362      52
180      49
358      46
Name: count, dtype: int64

Now let's see how many listings are available for at least 1 day.¶

In [100]:
available_listings = df1[df1['annual_availability']>0]
len(list(available_listings['annual_availability']))
Out[100]:
4318

Now let's see how many listings are unavailable, that is, 0 days.¶

In [101]:
unavailable_listings = df1[df1['annual_availability']==0]
len(list(unavailable_listings['annual_availability']))
Out[101]:
1188

We see that around 27% of all the listings are unavailable.¶

Plots for all availability¶

In [102]:
listings = df1.groupby('region',observed=False).size().reset_index(name='size')
listings=listings.sort_values(by='size',ascending=False).nlargest(10,'size')

for index, row in listings.iterrows():
    region=row.region
    dfa = df1[df1['region'] == region] 
    
    # Create subplots
    plt.figure(figsize=(16, 5))
    
    # Subplot 1: Available listings
    plt.subplot(121)
    sns.scatterplot(data=dfa, x='longitude', y='latitude', hue=available_listings.annual_availability)
    plt.title(f'Available listings of {region} region')
    plt.ylabel('Latitude')
    plt.xlabel('Longitude')
    
    # Subplot 2: Unavailable listings
    plt.subplot(122)
    sns.scatterplot(data=dfa, x='longitude', y='latitude', hue=unavailable_listings.annual_availability)
    plt.title(f'Unavailable listings of {region} region')
    plt.ylabel('Latitude')
    plt.xlabel('Longitude')
    
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 

Correlation¶

In [103]:
plt.figure(figsize=(16, 8))
int_columns = df1.select_dtypes(include=['int8','int16','int32', 'int64','float16','float32','float64'])
columns_to_exclude = ['_id', 'longitude','latitude','host_id',] 
int_columns = int_columns.drop(columns=columns_to_exclude, errors='ignore')
corr_matrix = int_columns.corr()
sns.heatmap(corr_matrix,annot=True,cmap="Blues",fmt=".2f")
Out[103]:
<Axes: >
No description has been provided for this image
In [104]:
corr = int_columns.corr()
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    f, ax = plt.subplots(figsize=(57, 15))
    ax = sns.heatmap(corr, mask=mask, square=True, cmap="Blues", annot=True, fmt=".2f")
No description has been provided for this image

This plot used to visualize the relationship between the features in our dataset.

  • Strong Correlation

    1. Bedrooms and accomodates 74%
    2. Beds and accomodates 82%
    3. Bedrooms and Beds 73%
    4. All type of reviews 99%
  • Medium Correlation

    1. Accomodates and Bathroom 57%
    2. Accomodates and Guest included 53%
    3. Bedrooms and Bathrooms 63%
    4. Bedrooms and Guest included 44%
    5. Beds and Bathrooms 60%
    6. Beds and Guest included 46%
    7. Price and Cleaning Fee 51%

Reviews based on country¶

In [105]:
plt.figure(figsize=(16, 6))
reviews = df1.groupby(['country','superhost'],observed=False).agg({'number_of_reviews': 'sum','price':'sum'}).reset_index().sort_values(by='number_of_reviews',ascending=False)
reviews['country'] = reviews['country'].astype(str)
sns.barplot(data=reviews, x="country", y="number_of_reviews",hue='superhost')
plt.show()
No description has been provided for this image

From above we could see that:

  • United States reviews count are higher when comparing to other.
  • Most the reviews are from Non-super hosts.

Host response rate in terms of country¶

In [106]:
plt.figure(figsize=(16, 6))
host_res_rate = df1.groupby(['country','superhost'],observed=False).agg({'host_response_rate': 'mean','price':'sum','host_total_listings_count':'mean'}).reset_index().sort_values(by='host_response_rate',ascending=False)
host_res_rate['country'] = host_res_rate['country'].astype(str)
sns.barplot(data=host_res_rate, x="country", y="host_response_rate",hue='superhost')
plt.show()
No description has been provided for this image

We could see that the superhost's response rate are higher than the non-super host

In [ ]:
 
In [107]:
sns.relplot(data=host_res_rate, x="host_total_listings_count", y="host_response_rate",hue='superhost')
plt.show()
No description has been provided for this image

From above, non-super host are listed most of the listings and response rates are higher for superhost

Paire Plot¶

In [115]:
sns.pairplot(df1[['property_type', 'room_type',  'cancellation_policy', 
        'number_of_reviews','price',
     'security_deposit', 'cleaning_fee',
       'annual_availability']])
Out[115]:
<seaborn.axisgrid.PairGrid at 0x252b1e53b90>
No description has been provided for this image

Exporting processed data for Analysize¶

In [116]:
df1.to_csv('final_df.csv',index=False)
In [ ]: